In [1]:
import pandas as pd
import numpy as np
In [2]:
import pandas as pd

# Read the Excel file
df = pd.read_excel('Datasets/GasByCounty.xlsx')

# Reshape the DataFrame
# The melt function is used to transform the DataFrame from a wide format to a long format.
# It uses 'County' and 'Sector' as identifier variables and 'Timestamp' as the variable name for the melted column.
# The 'Yearly Data' column contains the values that are being melted.
df = df.melt(id_vars=['County', 'Sector'],
             var_name='Timestamp', value_name='Yearly Data')

# Convert the "Timestamp" column to datetime format representing the year
# Commented out in this code snippet, but you can uncomment it if you want to convert the "Timestamp" column to datetime format.
# The format='%Y' specifies that the expected format of the values in the "Timestamp" column is year-only.
# By default, the pd.to_datetime function tries to infer the format, but specifying it explicitly can help avoid ambiguity.
# The resulting datetime values will represent the year extracted from the original values in the "Timestamp" column.

# Sort the DataFrame by County and Timestamp
# The sort_values function is used to sort the DataFrame by the specified columns.
# In this case, the DataFrame is sorted first by 'County' and then by 'Timestamp'.
df = df.sort_values(['County', 'Timestamp'])

# Print the resulting DataFrame
# The set_index function is used to set the "Timestamp" column as the new index of the DataFrame.
# The resulting DataFrame, df_indexd, will have the "Timestamp" column removed from the DataFrame and used as the index instead.
# This can be useful for indexing and accessing data based on the timestamp values.
df_indexd = df.set_index('Timestamp')

# The resulting DataFrame, df_indexd, will contain the original columns ('County' and 'Sector') along with the 'Yearly Data' column.
# The index will be the "Timestamp" column, which represents the year.
In [3]:
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima

# Read the Excel file
df = pd.read_excel('Datasets/GasByCounty.xlsx')

# Reshape the DataFrame
# The melt function is used to transform the DataFrame from a wide format to a long format.
# It uses 'County' and 'Sector' as identifier variables and 'Timestamp' as the variable name for the melted column.
# The 'Yearly Data' column contains the values that are being melted.
df = df.melt(id_vars=['County', 'Sector'],
             var_name='Timestamp', value_name='Yearly Data')

# Sort the DataFrame by County and Timestamp
# The sort_values function is used to sort the DataFrame by the specified columns.
# In this case, the DataFrame is sorted first by 'County' and then by 'Timestamp'.
df = df.sort_values(['County', 'Timestamp'])

# Drop rows with missing values (NaN)
# The dropna function is used to remove rows that contain missing values (NaN) from the DataFrame.
# This is done to ensure that only rows with complete data are used for analysis.
df = df.dropna()

# The resulting DataFrame, df, will contain the original columns ('County', 'Sector', 'Timestamp', 'Yearly Data'),
# with any rows containing missing values removed.
In [4]:
# Iterate over each country and sector
for country in df['County'].unique():
    for sector in df['Sector'].unique():
        fig = go.Figure()

        # Get the energy consumption data for the current country and sector
        df_filter = df[(df['County'] == country) & (
            df['Sector'] == sector)][['Timestamp', 'Yearly Data']]
        
        # Convert the "Timestamp" column to datetime format representing the year
        try:
            df_filter['Timestamp'] = pd.to_datetime(df_filter['Timestamp'], format='%Y')
        except ValueError:
            print(f"Skipping country: {country}, sector: {sector} due to invalid timestamp format")
            continue
        
        
        if df_filter.shape[0]>10:
            
            df_filter_index = df_filter.set_index('Timestamp')

            # Prepare the data for modeling
            years = df_filter_index.index
            energy_consumption = df_filter_index.values.flatten()

            # Split the data into training and testing
            # Use all data except the last 5 years for training
            train_data = energy_consumption[:-5]
            test_data = energy_consumption[-5:]  # Use the last 5 years for testing

            # Fit the auto ARIMA model
            model = auto_arima(train_data, seasonal=True)
            model.fit(train_data)

            # Generate predictions
            predictions = model.predict(n_periods=len(test_data))

            # Plot the training data
            fig.add_trace(go.Scatter(
                x=years[:-5], y=train_data, mode='lines+markers', name='Training Data'))

            # Plot the predictions
            fig.add_trace(go.Scatter(
                x=years[-5:], y=test_data, mode='lines+markers', name='Testing Data'))
            fig.add_trace(go.Scatter(
                x=years[-5:], y=predictions, mode='lines+markers', name='Predictions'))

            # Update the layout
            fig.update_layout(title=f'Gas Consumption Forecast Country : {country} : Sector {sector} ',
                            xaxis_title='Year', yaxis_title='Energy Consumption')

            # Show the plot
            fig.show()
    
In [ ]: